{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tagging the articles (in MongoDB) that contain both LOC and ORG tags."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from pymongo import MongoClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connection settings can be overridden through the MONGO_HOST / MONGO_PORT\n",
    "# environment variables so the notebook can run against another MongoDB\n",
    "# instance without editing code; the defaults are the original host and port.\n",
    "mongo = MongoClient(\n",
    "    host=os.environ.get('MONGO_HOST', '10.237.26.154'),\n",
    "    port=int(os.environ.get('MONGO_PORT', '27017')),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['local', 'media-db']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the database names reported by the connected server.\n",
    "mongo.list_database_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Handle to the 'media-db' database; its articles collection is queried next.\n",
    "mediaDb = mongo.get_database('media-db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query the articles collection for documents whose entities include type 'City'.\n",
    "cursor = mediaDb.articles.find({\"entities.type\":\"City\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateText(text, entype, offset, length):\n",
    "    if entype == 'Person':\n",
    "        otag = '<PER>'\n",
    "        ctag = '</PER>'\n",
    "    elif entype == 'Company' or entype == 'Organization':\n",
    "        otag = '<ORG>'\n",
    "        ctag = '</ORG>'\n",
    "    elif entype == 'City':\n",
    "        otag = '<LOC>'\n",
    "        ctag = '</LOC>'\n",
    "    elif entype == 'ProvinceOrState':\n",
    "        otag = '<STA>'\n",
    "        ctag = '</STA>'\n",
    "    else :\n",
    "        otag = '<MISC>'\n",
    "        ctag = '</MISC>'\n",
    "    p1 = text[:offset]\n",
    "    p2 = otag + text[offset:offset+length] + ctag\n",
    "    p3 = text[offset+length:]\n",
    "    \n",
    "    shift = len(p2) - length\n",
    "\n",
    "    return (p1 + p2 + p3, shift)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateOffset(entities, old_offset, shift):\n",
    "    for e in entities:\n",
    "        for inst in e['instances']:\n",
    "            if inst['offset'] > old_offset:\n",
    "                inst['offset']+=shift\n",
    "    return entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fileName = './articles_loc_and_org.txt'\n",
    "noOfArticles = 0\n",
    "itr = 0\n",
    "# The context manager closes the output file even if the loop fails part-way;\n",
    "# UTF-8 is set explicitly so non-ASCII article text does not depend on the locale.\n",
    "with open(fileName, 'w', encoding='utf-8') as fout:\n",
    "    # Iterate over a clone so that re-running this cell issues the query again\n",
    "    # instead of walking an exhausted cursor and leaving an empty output file.\n",
    "    for c in cursor.clone():\n",
    "        text = c['text']\n",
    "        entities = c['entities']\n",
    "        cityPresent = False\n",
    "        compPresent = False\n",
    "        num_of_city = 0\n",
    "        # Becomes True once a City mention is found anywhere other than offset 0.\n",
    "        cityPastStart = False\n",
    "        for e in entities:\n",
    "            type_ = e['type']\n",
    "            isCity = type_ == 'City'\n",
    "            if isCity:\n",
    "                cityPresent = True\n",
    "                num_of_city += 1\n",
    "            elif type_ in ('Company', 'Organization'):\n",
    "                compPresent = True\n",
    "            for inst in e['instances']:\n",
    "                offset = inst['offset']\n",
    "                # Offsets only grow when tags are inserted before them, so a\n",
    "                # mention that started at 0 is still at 0 when it is read here.\n",
    "                if isCity and offset != 0:\n",
    "                    cityPastStart = True\n",
    "                text, shift = updateText(text, type_, offset, inst['length'])\n",
    "                # Tagging lengthened the text; move every later mention along.\n",
    "                updateOffset(entities, offset, shift)\n",
    "        # Not a valid article when its single city is mentioned only as the\n",
    "        # first word - that is generally the location of the bureau. The offset\n",
    "        # has to be inspected for this: treating every single-city article as a\n",
    "        # dateline would discard articles that mention the city in the body.\n",
    "        datelineOnly = num_of_city == 1 and not cityPastStart\n",
    "        if cityPresent and compPresent and not datelineOnly:\n",
    "            print(text, file=fout, flush=True)\n",
    "            noOfArticles += 1\n",
    "        itr += 1\n",
    "print('Articles processed-', itr)\n",
    "print('Articles containing both company and the location tags', noOfArticles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of articles the tagging loop iterated over.\n",
    "itr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of articles written to the output file.\n",
    "noOfArticles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
